Short vs. long RNA-seq (differentiation over timecourse)

Author

Michel Tarnow

Published

December 23, 2023

Load packages

library(tidyverse)
library(RColorBrewer)
library(edgeR)
library(UpSetR)
library(viridis)
library(pheatmap)
library(gplots)
library(fgsea)
library(msigdbr)

Load data

df_list_bambu <- readRDS("../bambu.rds")
df_list_salmon <- readRDS("../salmon.rds")
df_list_meta <- readRDS("../meta.rds")
bambu_0.025_gene <- df_list_bambu$bambu_0.025_gene
bambu_0.05_gene <- df_list_bambu$bambu_0.05_gene
bambu_0.1_gene <- df_list_bambu$bambu_0.1_gene
bambu_0.2_gene <- df_list_bambu$bambu_0.2_gene

salmon_0.025_gene <- df_list_salmon$salmon_0.025_gene
salmon_0.05_gene <- df_list_salmon$salmon_0.05_gene
salmon_0.1_gene <- df_list_salmon$salmon_0.1_gene
salmon_0.2_gene <- df_list_salmon$salmon_0.2_gene

meta_samples <- df_list_meta$metadata

Create DGEList objects and filter expressed genes

day <- factor(meta_samples$time)

0.025

y_salmon_0.025 <- DGEList(counts = salmon_0.025_gene, group = day)
y_bambu_0.025 <- DGEList(counts = bambu_0.025_gene, group = day)

expressed_salmon <- filterByExpr(y_salmon_0.025)
expressed_bambu <- filterByExpr(y_bambu_0.025)

not_expressed <-
  intersect(names(expressed_salmon[!expressed_salmon]), names(expressed_bambu[!expressed_bambu]))

if (length(not_expressed) > 0) {
  y_salmon_0.025 <-
    y_salmon_0.025[-which(rownames(salmon_0.025_gene) %in% not_expressed), , keep.lib.sizes = FALSE]
  y_bambu_0.025 <-
    y_bambu_0.025[-which(rownames(bambu_0.025_gene) %in% not_expressed), , keep.lib.sizes = FALSE]
}

0.05

y_salmon_0.05 <- DGEList(counts = salmon_0.05_gene, group = day)
y_bambu_0.05 <- DGEList(counts = bambu_0.05_gene, group = day)

expressed_salmon <- filterByExpr(y_salmon_0.05)
expressed_bambu <- filterByExpr(y_bambu_0.05)

not_expressed <-
  intersect(names(expressed_salmon[!expressed_salmon]), names(expressed_bambu[!expressed_bambu]))

if (length(not_expressed) > 0) {
  y_salmon_0.05 <-
    y_salmon_0.05[-which(rownames(salmon_0.05_gene) %in% not_expressed), , keep.lib.sizes = FALSE]
  y_bambu_0.05 <-
    y_bambu_0.05[-which(rownames(bambu_0.05_gene) %in% not_expressed), , keep.lib.sizes = FALSE]
}

0.1

y_salmon_0.1 <- DGEList(counts = salmon_0.1_gene, group = day)
y_bambu_0.1 <- DGEList(counts = bambu_0.1_gene, group = day)

expressed_salmon <- filterByExpr(y_salmon_0.1)
expressed_bambu <- filterByExpr(y_bambu_0.1)

not_expressed <-
  intersect(names(expressed_salmon[!expressed_salmon]), names(expressed_bambu[!expressed_bambu]))

if (length(not_expressed) > 0) {
  y_salmon_0.1 <-
    y_salmon_0.1[-which(rownames(salmon_0.1_gene) %in% not_expressed), , keep.lib.sizes = FALSE]
  y_bambu_0.1 <-
    y_bambu_0.1[-which(rownames(bambu_0.1_gene) %in% not_expressed), , keep.lib.sizes = FALSE]
}

0.2

y_salmon_0.2 <- DGEList(counts = salmon_0.2_gene, group = day)
y_bambu_0.2 <- DGEList(counts = bambu_0.2_gene, group = day)

expressed_salmon <- filterByExpr(y_salmon_0.2)
expressed_bambu <- filterByExpr(y_bambu_0.2)

not_expressed <-
  intersect(names(expressed_salmon[!expressed_salmon]), names(expressed_bambu[!expressed_bambu]))

if (length(not_expressed) > 0) {
  y_salmon_0.2 <-
    y_salmon_0.2[-which(rownames(salmon_0.2_gene) %in% not_expressed), , keep.lib.sizes = FALSE]
  y_bambu_0.2 <-
    y_bambu_0.2[-which(rownames(bambu_0.2_gene) %in% not_expressed), , keep.lib.sizes = FALSE]
}
dim(y_salmon_0.025) == dim(y_bambu_0.025)
[1] TRUE TRUE
dim(y_salmon_0.05) == dim(y_bambu_0.05)
[1] TRUE TRUE
dim(y_salmon_0.1) == dim(y_bambu_0.1)
[1] TRUE TRUE
dim(y_salmon_0.2) == dim(y_bambu_0.2)
[1] TRUE TRUE

Differential gene expression analysis

# create design matrix
design <- model.matrix( ~ day)
design
   (Intercept) day1 day2 day3 day4 day5
1            1    0    0    0    0    0
2            1    0    0    0    0    0
3            1    0    0    0    0    0
4            1    1    0    0    0    0
5            1    0    1    0    0    0
6            1    0    0    1    0    0
7            1    0    0    1    0    0
8            1    0    0    0    1    0
9            1    0    0    0    0    1
10           1    0    0    0    0    1
11           1    0    0    0    0    1
attr(,"assign")
[1] 0 1 1 1 1 1
attr(,"contrasts")
attr(,"contrasts")$day
[1] "contr.treatment"

0.025

# Salmon 0.025
y_salmon_0.025 <- calcNormFactors(y_salmon_0.025)
## estimate dispersion
y_salmon_0.025 <- estimateDisp(y_salmon_0.025, design)
## fit the negative binomial model
fit <- glmFit(y_salmon_0.025, design)
## conduct genewise statistical tests for a given coefficient contrast (coef = row of design matrix)
lrtS <- glmLRT(fit, coef = 2:6)
#topTags(lrt)
## multiple testing correction
decide_dif_s0.025 <-
  decideTests.DGELRT(lrtS,
                     adjust.method = "BH",
                     p.value = 0.01)
summary(decide_dif_s0.025)
       day5-day4-day3-day2-day1
NotSig                    17374
Sig                        7343
# Bambu 0.025
y_bambu_0.025 <- calcNormFactors(y_bambu_0.025)
## estimate dispersion
y_bambu_0.025 <- estimateDisp(y_bambu_0.025, design)
## fit the negative binomial model
fit <- glmFit(y_bambu_0.025, design)
## conduct genewise statistical tests for a given coefficient contrast (coef = row of design matrix)
lrtB <- glmLRT(fit, coef = 2:6)
#topTags(lrt)
## multiple testing correction
decide_dif_b0.025 <-
  decideTests.DGELRT(lrtB,
                     adjust.method = "BH",
                     p.value = 0.01)
summary(decide_dif_b0.025)
       day5-day4-day3-day2-day1
NotSig                    16879
Sig                        7838
# overlay results of both technologies

de_genes_0.025 <- as.data.frame(cbind(decide_dif_s0.025, decide_dif_b0.025))
colnames(de_genes_0.025) <- c("Salmon_0.025", "Bambu_0.025")
upset(de_genes_0.025, sets = colnames(de_genes_0.025))

# comparison of padj and logfc

padjS <- p.adjust(lrtS$table$PValue, method = "BH")
padjB <- p.adjust(lrtB$table$PValue, method = "BH")
padj <- data.frame(Salmon = padjS, Bambu = padjB)

logfcS <- lrtS$table$logFC.day5
logfcB <- lrtB$table$logFC.day5
logfc <- data.frame(Salmon = logfcS, Bambu = logfcB)

ggplot(data = padj, mapping = aes(x = -log10(Salmon), y = -log10(Bambu))) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1, color = "blue") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_viridis() +
  labs(title = "Comparison of P-values (NDR=0.025)", x = "-log10 P-value (Salmon)", y = "-log10 P-value (Bambu)")

ggplot(data = logfc, mapping = aes(x = Salmon, y = Bambu)) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1, color = "blue") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_viridis() +
  labs(title = "Comparison of LogFC (NDR=0.025)", x = "LogFC (Salmon)", y = "LogFC (Bambu)")

# time point comparison

## Salmon
fit <- glmFit(y_salmon_0.025, design)
lrt <- glmLRT(fit, coef = 6)
decide_dif_s0.025_tp <-
  decideTests.DGELRT(lrt,
                     adjust.method = "BH",
                     p.value = 0.01,
                     lfc = log2(2))
summary(decide_dif_s0.025_tp)
        day5
Down    2377
NotSig 18283
Up      4057
ggplot(
  data = as.data.frame(lrt$table),
  mapping = aes(
    x = logFC,
    y = -log10(p.adjust(PValue, method = "BH")),
    color = as.factor(decide_dif_s0.025_tp)
  )
) +
  geom_point() +
  geom_vline(xintercept = log2(2), linetype = "dashed") +
  geom_vline(xintercept = -log2(2), linetype = "dashed") +
  geom_hline(yintercept = -log10(0.01), linetype = "dashed") +
  labs(
    title = "Differential gene expression Salmon 0.025 Day5-Day0",
    x = "LogFC",
    y = "-10log(p-value)",
    color = "DE"
  ) +
  scale_color_viridis_d() +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

## Bambu
fit <- glmFit(y_bambu_0.025, design)
lrt <- glmLRT(fit, coef = 6)
decide_dif_b0.025_tp <-
  decideTests.DGELRT(lrt,
                     adjust.method = "BH",
                     p.value = 0.01,
                     lfc = log2(2))
summary(decide_dif_b0.025_tp)
        day5
Down    2014
NotSig 19559
Up      3144
ggplot(
  data = as.data.frame(lrt$table),
  mapping = aes(
    x = logFC,
    y = -log10(p.adjust(PValue, method = "BH")),
    color = as.factor(decide_dif_b0.025_tp)
  )
) +
  geom_point() +
  geom_vline(xintercept = log2(2), linetype = "dashed") +
  geom_vline(xintercept = -log2(2), linetype = "dashed") +
  geom_hline(yintercept = -log10(0.01), linetype = "dashed") +
  labs(
    title = "Differential gene expression Bambu 0.025 Day5-Day0",
    x = "LogFC",
    y = "-10log(p-value)",
    color = "DE"
  ) +
  scale_color_viridis_d() +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

0.05

# Salmon 0.05
y_salmon_0.05 <- calcNormFactors(y_salmon_0.05)
## estimate dispersion
y_salmon_0.05 <- estimateDisp(y_salmon_0.05, design)
## fit the negative binomial model
fit <- glmFit(y_salmon_0.05, design)
## conduct genewise statistical tests for a given coefficient contrast (coef = row of design matrix)
lrtS <- glmLRT(fit, coef = 2:6)
#topTags(lrt)
## multiple testing correction
decide_dif_s0.05 <-
  decideTests.DGELRT(lrtS,
                     adjust.method = "BH",
                     p.value = 0.01)
summary(decide_dif_s0.05)
       day5-day4-day3-day2-day1
NotSig                    17444
Sig                        7367
# Bambu 0.05
y_bambu_0.05 <- calcNormFactors(y_bambu_0.05)
## estimate dispersion
y_bambu_0.05 <- estimateDisp(y_bambu_0.05, design)
## fit the negative binomial model
fit <- glmFit(y_bambu_0.05, design)
## conduct genewise statistical tests for a given coefficient contrast (coef = row of design matrix)
lrtB <- glmLRT(fit, coef = 2:6)
#topTags(lrt)
## multiple testing correction
decide_dif_b0.05 <-
  decideTests.DGELRT(lrtB,
                     adjust.method = "BH",
                     p.value = 0.01)
summary(decide_dif_b0.05)
       day5-day4-day3-day2-day1
NotSig                    16927
Sig                        7884
# overlay results of both technologies

de_genes_0.05 <- as.data.frame(cbind(decide_dif_s0.05, decide_dif_b0.05))
colnames(de_genes_0.05) <- c("Salmon_0.05", "Bambu_0.05")
upset(de_genes_0.05, sets = colnames(de_genes_0.05))

# comparison of padj and logfc

padjS <- p.adjust(lrtS$table$PValue, method = "BH")
padjB <- p.adjust(lrtB$table$PValue, method = "BH")
padj <- data.frame(Salmon = padjS, Bambu = padjB)

logfcS <- lrtS$table$logFC.day5
logfcB <- lrtB$table$logFC.day5
logfc <- data.frame(Salmon = logfcS, Bambu = logfcB)

ggplot(data = padj, mapping = aes(x = -log10(Salmon), y = -log10(Bambu))) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1, color = "blue") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_viridis() +
  labs(title = "Comparison of P-values (NDR=0.05)", x = "-log10 P-value (Salmon)", y = "-log10 P-value (Bambu)")

ggplot(data = logfc, mapping = aes(x = Salmon, y = Bambu)) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1, color = "blue") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_viridis() +
  labs(title = "Comparison of LogFC (NDR=0.05)", x = "LogFC (Salmon)", y = "LogFC (Bambu)")

# time point comparison

## Salmon
fit <- glmFit(y_salmon_0.05, design)
lrt <- glmLRT(fit, coef = 6)
decide_dif_s0.05_tp <-
  decideTests.DGELRT(lrt,
                     adjust.method = "BH",
                     p.value = 0.01,
                     lfc = log2(2))
summary(decide_dif_s0.05_tp)
        day5
Down    2388
NotSig 18352
Up      4071
ggplot(
  data = as.data.frame(lrt$table),
  mapping = aes(
    x = logFC,
    y = -log10(p.adjust(PValue, method = "BH")),
    color = as.factor(decide_dif_s0.05_tp)
  )
) +
  geom_point() +
  geom_vline(xintercept = log2(2), linetype = "dashed") +
  geom_vline(xintercept = -log2(2), linetype = "dashed") +
  geom_hline(yintercept = -log10(0.01), linetype = "dashed") +
  labs(
    title = "Differential gene expression Salmon 0.05 Day5-Day0",
    x = "LogFC",
    y = "-10log(p-value)",
    color = "DE"
  ) +
  scale_color_viridis_d() +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

## Bambu
fit <- glmFit(y_bambu_0.05, design)
lrt <- glmLRT(fit, coef = 6)
decide_dif_b0.05_tp <-
  decideTests.DGELRT(lrt,
                     adjust.method = "BH",
                     p.value = 0.01,
                     lfc = log2(2))
summary(decide_dif_b0.05_tp)
        day5
Down    2031
NotSig 19608
Up      3172
ggplot(
  data = as.data.frame(lrt$table),
  mapping = aes(
    x = logFC,
    y = -log10(p.adjust(PValue, method = "BH")),
    color = as.factor(decide_dif_b0.05_tp)
  )
) +
  geom_point() +
  geom_vline(xintercept = log2(2), linetype = "dashed") +
  geom_vline(xintercept = -log2(2), linetype = "dashed") +
  geom_hline(yintercept = -log10(0.01), linetype = "dashed") +
  labs(
    title = "Differential gene expression Bambu 0.05 Day5-Day0",
    x = "LogFC",
    y = "-10log(p-value)",
    color = "DE"
  ) +
  scale_color_viridis_d() +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

0.1

# Salmon 0.1
y_salmon_0.1 <- calcNormFactors(y_salmon_0.1)
## estimate dispersion
y_salmon_0.1 <- estimateDisp(y_salmon_0.1, design)
## fit the negative binomial model
fit <- glmFit(y_salmon_0.1, design)
## conduct genewise statistical tests for a given coefficient contrast (coef = row of design matrix)
lrtS <- glmLRT(fit, coef = 2:6)
#topTags(lrt)
## multiple testing correction
decide_dif_s0.1 <-
  decideTests.DGELRT(lrtS,
                     adjust.method = "BH",
                     p.value = 0.01)
summary(decide_dif_s0.1)
       day5-day4-day3-day2-day1
NotSig                    17575
Sig                        7529
# Bambu 0.1
y_bambu_0.1 <- calcNormFactors(y_bambu_0.1)
## estimate dispersion
y_bambu_0.1 <- estimateDisp(y_bambu_0.1, design)
## fit the negative binomial model
fit <- glmFit(y_bambu_0.1, design)
## conduct genewise statistical tests for a given coefficient contrast (coef = row of design matrix)
lrtB <- glmLRT(fit, coef = 2:6)
#topTags(lrt)
## multiple testing correction
decide_dif_b0.1 <-
  decideTests.DGELRT(lrtB,
                     adjust.method = "BH",
                     p.value = 0.01)
summary(decide_dif_b0.1)
       day5-day4-day3-day2-day1
NotSig                    17056
Sig                        8048
# overlay results of both technologies

de_genes_0.1 <- as.data.frame(cbind(decide_dif_s0.1, decide_dif_b0.1))
colnames(de_genes_0.1) <- c("Salmon_0.1", "Bambu_0.1")
upset(de_genes_0.1, sets = colnames(de_genes_0.1))

# comparison of padj and logfc

padjS <- p.adjust(lrtS$table$PValue, method = "BH")
padjB <- p.adjust(lrtB$table$PValue, method = "BH")
padj <- data.frame(Salmon = padjS, Bambu = padjB)

logfcS <- lrtS$table$logFC.day5
logfcB <- lrtB$table$logFC.day5
logfc <- data.frame(Salmon = logfcS, Bambu = logfcB)

ggplot(data = padj, mapping = aes(x = -log10(Salmon), y = -log10(Bambu))) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1, color = "blue") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_viridis() +
  labs(title = "Comparison of P-values (NDR=0.1)", x = "-log10 P-value (Salmon)", y = "-log10 P-value (Bambu)")

ggplot(data = logfc, mapping = aes(x = Salmon, y = Bambu)) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1, color = "blue") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_viridis() +
  labs(title = "Comparison of LogFC (NDR=0.1)", x = "LogFC (Salmon)", y = "LogFC (Bambu)")

# time point comparison

## Salmon
fit <- glmFit(y_salmon_0.1, design)
lrt <- glmLRT(fit, coef = 6)
decide_dif_s0.1_tp <-
  decideTests.DGELRT(lrt,
                     adjust.method = "BH",
                     p.value = 0.01,
                     lfc = log2(2))
summary(decide_dif_s0.1_tp)
        day5
Down    2467
NotSig 18544
Up      4093
ggplot(
  data = as.data.frame(lrt$table),
  mapping = aes(
    x = logFC,
    y = -log10(p.adjust(PValue, method = "BH")),
    color = as.factor(decide_dif_s0.1_tp)
  )
) +
  geom_point() +
  geom_vline(xintercept = log2(2), linetype = "dashed") +
  geom_vline(xintercept = -log2(2), linetype = "dashed") +
  geom_hline(yintercept = -log10(0.01), linetype = "dashed") +
  labs(
    title = "Differential gene expression Salmon 0.1 Day5-Day0",
    x = "LogFC",
    y = "-10log(p-value)",
    color = "DE"
  ) +
  scale_color_viridis_d() +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

## Bambu
fit <- glmFit(y_bambu_0.1, design)
lrt <- glmLRT(fit, coef = 6)
decide_dif_b0.1_tp <-
  decideTests.DGELRT(lrt,
                     adjust.method = "BH",
                     p.value = 0.01,
                     lfc = log2(2))
summary(decide_dif_b0.1_tp)
        day5
Down    2093
NotSig 19746
Up      3265
ggplot(
  data = as.data.frame(lrt$table),
  mapping = aes(
    x = logFC,
    y = -log10(p.adjust(PValue, method = "BH")),
    color = as.factor(decide_dif_b0.1_tp)
  )
) +
  geom_point() +
  geom_vline(xintercept = log2(2), linetype = "dashed") +
  geom_vline(xintercept = -log2(2), linetype = "dashed") +
  geom_hline(yintercept = -log10(0.01), linetype = "dashed") +
  labs(
    title = "Differential gene expression Bambu 0.1 Day5-Day0",
    x = "LogFC",
    y = "-10log(p-value)",
    color = "DE"
  ) +
  scale_color_viridis_d() +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

0.2

# Salmon 0.2
y_salmon_0.2 <- calcNormFactors(y_salmon_0.2)
## estimate dispersion
y_salmon_0.2 <- estimateDisp(y_salmon_0.2, design)
## fit the negative binomial model
fit <- glmFit(y_salmon_0.2, design)
## conduct genewise statistical tests for a given coefficient contrast (coef = row of design matrix)
lrtS <- glmLRT(fit, coef = 2:6)
## save lrt for pathway analysis
lrt_pathway <- lrtS
#topTags(lrt)
## multiple testing correction
decide_dif_s0.2 <-
  decideTests.DGELRT(lrtS,
                     adjust.method = "BH",
                     p.value = 0.01)
summary(decide_dif_s0.2)
       day5-day4-day3-day2-day1
NotSig                    18289
Sig                        7723
# Bambu 0.2
y_bambu_0.2 <- calcNormFactors(y_bambu_0.2)
## estimate dispersion
y_bambu_0.2 <- estimateDisp(y_bambu_0.2, design)
## fit the negative binomial model
fit <- glmFit(y_bambu_0.2, design)
## conduct genewise statistical tests for a given coefficient contrast (coef = row of design matrix)
lrtB <- glmLRT(fit, coef = 2:6)
#topTags(lrt)
## multiple testing correction
decide_dif_b0.2 <-
  decideTests.DGELRT(lrtB,
                     adjust.method = "BH",
                     p.value = 0.01)
summary(decide_dif_b0.2)
       day5-day4-day3-day2-day1
NotSig                    17667
Sig                        8345
# overlay results of both technologies

de_genes_0.2 <- as.data.frame(cbind(decide_dif_s0.2, decide_dif_b0.2))
colnames(de_genes_0.2) <- c("Salmon_0.2", "Bambu_0.2")
upset(de_genes_0.2, sets = colnames(de_genes_0.2))

# comparison of padj and logfc

padjS <- p.adjust(lrtS$table$PValue, method = "BH")
padjB <- p.adjust(lrtB$table$PValue, method = "BH")
padj <- data.frame(Salmon = padjS, Bambu = padjB)

logfcS <- lrtS$table$logFC.day5
logfcB <- lrtB$table$logFC.day5
logfc <- data.frame(Salmon = logfcS, Bambu = logfcB)

ggplot(data = padj, mapping = aes(x = -log10(Salmon), y = -log10(Bambu))) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1, color = "blue") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_viridis() +
  labs(title = "Comparison of P-values (NDR=0.2)", x = "-log10 P-value (Salmon)", y = "-log10 P-value (Bambu)")

ggplot(data = logfc, mapping = aes(x = Salmon, y = Bambu)) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1, color = "blue") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_viridis() +
  labs(title = "Comparison of LogFC (NDR=0.2)", x = "LogFC (Salmon)", y = "LogFC (Bambu)")

# time point comparison

## Salmon
fit <- glmFit(y_salmon_0.2, design)
lrt <- glmLRT(fit, coef = 6)
decide_dif_s0.2_tp <-
  decideTests.DGELRT(lrt,
                     adjust.method = "BH",
                     p.value = 0.01,
                     lfc = log2(2))
summary(decide_dif_s0.2_tp)
        day5
Down    2556
NotSig 19230
Up      4226
ggplot(
  data = as.data.frame(lrt$table),
  mapping = aes(
    x = logFC,
    y = -log10(p.adjust(PValue, method = "BH")),
    color = as.factor(decide_dif_s0.2_tp)
  )
) +
  geom_point() +
  geom_vline(xintercept = log2(2), linetype = "dashed") +
  geom_vline(xintercept = -log2(2), linetype = "dashed") +
  geom_hline(yintercept = -log10(0.01), linetype = "dashed") +
  labs(
    title = "Differential gene expression Salmon 0.2 Day5-Day0",
    x = "LogFC",
    y = "-10log(p-value)",
    color = "DE"
  ) +
  scale_color_viridis_d() +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

## Bambu
fit <- glmFit(y_bambu_0.2, design)
lrt <- glmLRT(fit, coef = 6)
decide_dif_b0.2_tp <-
  decideTests.DGELRT(lrt,
                     adjust.method = "BH",
                     p.value = 0.01,
                     lfc = log2(2))
summary(decide_dif_b0.2_tp)
        day5
Down    2193
NotSig 20389
Up      3430
ggplot(
  data = as.data.frame(lrt$table),
  mapping = aes(
    x = logFC,
    y = -log10(p.adjust(PValue, method = "BH")),
    color = as.factor(decide_dif_b0.2_tp)
  )
) +
  geom_point() +
  geom_vline(xintercept = log2(2), linetype = "dashed") +
  geom_vline(xintercept = -log2(2), linetype = "dashed") +
  geom_hline(yintercept = -log10(0.01), linetype = "dashed") +
  labs(
    title = "Differential gene expression Bambu 0.2 Day5-Day0",
    x = "LogFC",
    y = "-10log(p-value)",
    color = "DE"
  ) +
  scale_color_viridis_d() +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

Grouping of DE genes

s <- which(de_genes_0.2$Salmon_0.2 == 1)
b <- which(de_genes_0.2$Bambu_0.2 == 1)
s_b <- intersect(s, b)
DEG <- rownames(de_genes_0.2[s_b,])
avg_expr <- as.data.frame(sapply(sort(unique(meta_samples$time))[c(1,4,6)], function(time) {
  rowMeans(salmon_0.2_gene[, which(meta_samples$time == time)])
}))

avg_expr$V4 <- unlist(as.vector(salmon_0.2_gene[, which(meta_samples$time == 1)]))
avg_expr$V5 <- unlist(as.vector(salmon_0.2_gene[, which(meta_samples$time == 2)]))
avg_expr$V6 <- unlist(as.vector(salmon_0.2_gene[, which(meta_samples$time == 4)]))

avg_expr <- avg_expr[, c(1,4,5,2,6,3)]

colnames(avg_expr) <- sort(unique(meta_samples$time))
corr_DEG <- cor(t(avg_expr[DEG,]), method = "spearman")
hcl_DEG <- hclust(as.dist(1 - corr_DEG), method = "complete")
plot(hcl_DEG, label = FALSE)

cl_DEG <- cutree(hcl_DEG, k = 4)
heatmap.2(corr_DEG, Rowv = as.dendrogram(hcl_DEG), Colv = as.dendrogram(hcl_DEG),
          trace = "none", scale = "none", labRow = NA, labCol = NA, col = viridis,
          ColSideColors = rainbow(15)[cl_DEG])

avg_expr_DEG_list <- tapply(names(cl_DEG), cl_DEG, function(x) avg_expr[x,])
scaled_expr_DEG_list <- lapply(avg_expr_DEG_list, function(x) t(scale(t(x))))

layout(matrix(1:4, nrow = 2, byrow = T))
for(cl in 1:4)
  boxplot(scaled_expr_DEG_list[[cl]],
          main = paste0(cl, " (", nrow(scaled_expr_DEG_list[[cl]]), ")"))

Function/Pathway analysis

DAVID (frequency-based)

get_ensemble_id <- function(input_string) {
  ensemble_id_version <- unlist(strsplit(input_string, "\\."))
  return(ensemble_id_version[1])
}
input <- names(which(cl_DEG == 1))
names_cluster1 <- unname(sapply(input, get_ensemble_id))

input <- names(which(cl_DEG == 2))
names_cluster2 <- unname(sapply(input, get_ensemble_id))

input <- names(which(cl_DEG == 3))
names_cluster3 <- unname(sapply(input, get_ensemble_id))

input <- names(which(cl_DEG == 4))
names_cluster4 <- unname(sapply(input, get_ensemble_id))

input <- rownames(y_salmon_0.025)
background <- unname(sapply(input, get_ensemble_id))
write.table(
  names_cluster1[-which(startsWith(names_cluster1, "Bambu"))],
  file = "../genes_C1.txt",
  quote = F,
  row.names = F,
  col.names = F
)

write.table(
  names_cluster2,
  file = "../genes_C2.txt",
  quote = F,
  row.names = F,
  col.names = F
)

write.table(
  names_cluster3,
  file = "../genes_C3.txt",
  quote = F,
  row.names = F,
  col.names = F
)

write.table(
  names_cluster4,
  file = "../genes_C4.txt",
  quote = F,
  row.names = F,
  col.names = F
)

write.table(
  background[-which(startsWith(background, "Bambu"))],
  file = "../genes_expressed.txt",
  quote = F,
  row.names = F,
  col.names = F
)

GSEA (ranked-based)

top_genes <- topTags(lrt_pathway, n = nrow(y_salmon_0.2))
padj <- p.adjust(p = top_genes$table$PValue, method = "BH")
ranked_list_names <- unname(sapply(rownames(top_genes), get_ensemble_id))

ranked_list <- padj
names(ranked_list) <- ranked_list_names
genesets <- msigdbr(species = "Homo sapiens", category = "C8")
genesets_list <- tapply(genesets$ensembl_gene, genesets$gs_name, list)

fgsea_kegg <- fgsea(
  pathways = genesets_list,
  stats = ranked_list,
  scoreType = "pos",
  minSize  = 15,
  maxSize  = 500
)
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (26.66% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
fgsea_kegg[order(NES, decreasing = T), ][1:10, 1:7]
                                            pathway         pval         padj
 1:                         FAN_EMBRYONIC_CTX_NSC_1 8.929877e-04 0.0302623602
 2: DESCARTES_FETAL_LUNG_VASCULAR_ENDOTHELIAL_CELLS 4.349771e-05 0.0037905144
 3:                       TRAVAGLINI_LUNG_CLUB_CELL 1.172221e-06 0.0003114217
 4:           BUSSLINGER_GASTRIC_LYZ_POSITIVE_CELLS 1.829102e-05 0.0018595867
 5:              RUBENSTEIN_SKELETAL_MUSCLE_B_CELLS 3.240715e-07 0.0001976836
 6:                TRAVAGLINI_LUNG_CD4_NAIVE_T_CELL 3.629272e-06 0.0005534640
 7:      TRAVAGLINI_LUNG_CD8_MEMORY_EFFECTOR_T_CELL 7.547702e-03 0.1587619992
 8:       FAN_OVARY_CL0_XBP1_SELK_HIGH_STROMAL_CELL 4.597052e-06 0.0005608403
 9:               AIZARANI_LIVER_C12_NK_NKT_CELLS_4 5.316876e-03 0.1201220058
10:              RUBENSTEIN_SKELETAL_MUSCLE_T_CELLS 1.531582e-06 0.0003114217
      log2err        ES      NES size
 1: 0.4772708 0.7915633 1.453507   17
 2: 0.5573322 0.7044832 1.378827   46
 3: 0.6435518 0.6366332 1.286296  106
 4: 0.5756103 0.6283243 1.266456   98
 5: 0.6749629 0.6118143 1.250337  163
 6: 0.6272567 0.6153934 1.249632  128
 7: 0.4070179 0.6579373 1.241656   26
 8: 0.6105269 0.6057760 1.234454  143
 9: 0.4070179 0.6299482 1.230173   44
10: 0.6435518 0.5981993 1.225006  177